numpymatplotlib/seabornplotlyimport these into our programimport matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
sns.set_context('poster')
colours = sns.color_palette()
import pandas as pd
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
sns.set_palette(colours)
an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
DataFrame, which is a bit like an Excel worksheet - it uses columns and row to store data in a tabular formatsitename is not the same as SiteName)original_data = pd.read_csv('VBV_Data\VBV_20170131_JTC00567_shay_2940.csv', low_memory=False)
original_data.head(15)
| RecordedDataTime | SiteName | SpeedMPH | Class | Length(Metres) | Chasis Ht | Info | Gap | Lane | Direction | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-06-01 00:01:04 | JTC00567 | 68 | 2 | 4 | NaN | Car / Small Van <5.2 metres | NaN | 1 | S |
| 1 | 2016-06-01 00:01:09 | JTC00567 | 64 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 49.0 | 1 | S |
| 2 | 2016-06-01 00:01:12 | JTC00567 | 57 | 5 | 18 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | NaN | 1 | N |
| 3 | 2016-06-01 00:01:25 | JTC00567 | 76 | 4 | 6 | NaN | Rigid HGV / Large Van | 158.0 | 1 | S |
| 4 | 2016-06-01 00:02:30 | JTC00567 | 58 | 4 | 8 | NaN | Rigid HGV / Large Van | NaN | 2 | N |
| 5 | 2016-06-01 00:02:30 | JTC00567 | 57 | 5 | 16 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 774.0 | 1 | N |
| 6 | 2016-06-01 00:03:05 | JTC00567 | 53 | 5 | 15 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 1002.0 | 1 | S |
| 7 | 2016-06-01 00:04:41 | JTC00567 | 58 | 5 | 18 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 950.0 | 1 | S |
| 8 | 2016-06-01 00:04:44 | JTC00567 | 77 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 1335.0 | 1 | N |
| 9 | 2016-06-01 00:04:53 | JTC00567 | 66 | 2 | 3 | NaN | Car / Small Van <5.2 metres | 110.0 | 1 | S |
| 10 | 2016-06-01 00:04:58 | JTC00567 | 57 | 5 | 16 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 138.0 | 1 | N |
| 11 | 2016-06-01 00:05:08 | JTC00567 | 51 | 5 | 16 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 146.0 | 1 | S |
| 12 | 2016-06-01 00:05:10 | JTC00567 | 52 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 21.0 | 1 | S |
| 13 | 2016-06-01 00:05:50 | JTC00567 | 61 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 507.0 | 1 | N |
| 14 | 2016-06-01 00:06:04 | JTC00567 | 48 | 5 | 17 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 141.0 | 1 | N |
pandas, we can start to manipulate it.pandas) can manipulate dates and times, if they're the correct kind of object.# Create a "Count" column, which will be 1 for each record to begin wth
original_data['Count'] = 1
# Make sure dates are used as dates, and set them as the index so we can summarise
original_data['DateTime'] = pd.to_datetime(original_data['RecordedDataTime'])
original_data.set_index('DateTime', inplace=True)
# Work out the vehicle count per class and direction for each hour
hourly_counts = original_data.groupby([pd.TimeGrouper('1H'), 'Info'])['Count'].count().reset_index()
hourly_counts.head(15)
| DateTime | Info | Count | |
|---|---|---|---|
| 0 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | 48 |
| 1 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres with Trailer | 2 |
| 2 | 2016-06-01 00:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 27 |
| 3 | 2016-06-01 00:00:00 | Rigid HGV / Large Van | 11 |
| 4 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | 35 |
| 5 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres with Trailer | 3 |
| 6 | 2016-06-01 01:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 31 |
| 7 | 2016-06-01 01:00:00 | Rigid HGV / Large Van | 9 |
| 8 | 2016-06-01 02:00:00 | Car / Small Van <5.2 metres | 30 |
| 9 | 2016-06-01 02:00:00 | Car / Small Van <5.2 metres with Trailer | 2 |
| 10 | 2016-06-01 02:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 39 |
| 11 | 2016-06-01 02:00:00 | Rigid HGV / Large Van | 17 |
| 12 | 2016-06-01 03:00:00 | Car / Small Van <5.2 metres | 61 |
| 13 | 2016-06-01 03:00:00 | Car / Small Van <5.2 metres with Trailer | 1 |
| 14 | 2016-06-01 03:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 36 |
# Work out the vehicle count per class and direction for each hour
hourly_counts_d = original_data.groupby([pd.TimeGrouper('1H'), 'Info', 'Direction'])['Count'].count().reset_index()
hourly_counts_d.head(15)
| DateTime | Info | Direction | Count | |
|---|---|---|---|---|
| 0 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | N | 26 |
| 1 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | S | 22 |
| 2 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres with Trailer | N | 2 |
| 3 | 2016-06-01 00:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | N | 15 |
| 4 | 2016-06-01 00:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | S | 12 |
| 5 | 2016-06-01 00:00:00 | Rigid HGV / Large Van | N | 8 |
| 6 | 2016-06-01 00:00:00 | Rigid HGV / Large Van | S | 3 |
| 7 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | N | 19 |
| 8 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | S | 16 |
| 9 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres with Trailer | N | 1 |
| 10 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres with Trailer | S | 2 |
| 11 | 2016-06-01 01:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | N | 14 |
| 12 | 2016-06-01 01:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | S | 17 |
| 13 | 2016-06-01 01:00:00 | Rigid HGV / Large Van | N | 7 |
| 14 | 2016-06-01 01:00:00 | Rigid HGV / Large Van | S | 2 |
hourly_counts.to_csv('JTC00567 Hourly Summaries.csv', index=False)
hourly_counts_d.to_csv('JTC00567 Directional Hourly Summaries.csv', index=False)
DataFrames to Excel# Create a "writer"
excel_writer = pd.ExcelWriter('JTC00567.xlsx')
# Write each DataFrame to the writer separately, specifying the name we want to use for the worksheet
original_data.to_excel(excel_writer, 'Raw Counts', index=False)
hourly_counts.to_excel(excel_writer, 'Hourly Counts', index=False)
hourly_counts_d.to_excel(excel_writer, 'Hourly Counts_Directional', index=False)
# Save the "writer"
excel_writer.save()
matplotlib is a plotting library for Python, allowing for the production of a wide range of graphsseaborn extends matplotlib by adding support for further types of graphs, improved interaction with pandas dataframes and generally by making matplotlib graphs prettierseaborn into our project, we use its colour schemes even when we plot graphs with matplotlib (which in some cases is easier)matplotlib can plot line charts from our data, and has a special interface for dealing with datesgroupby# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
# Get the data for each class in the DataFrame
for vehicle_class, data in hourly_counts.groupby('Info'):
# Plot the data for that class
ax.plot_date(data['DateTime'], data['Count'], label=vehicle_class, linestyle='-', marker='')
# Set up legend
plt.legend(frameon=True, facecolor='white',
loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.6))
# Set up axis labels
plt.xlabel('Date')
plt.ylabel('Vehicles')
# Fix margins properly
plt.ylim(bottom=0)
ax.margins(x=0)
plt.title('Total Flow - JTC00567')
<matplotlib.text.Text at 0x31182dd8>
subplots# Set up the subplots
fig, all_axes = plt.subplots(2, figsize=(15, 10), sharex=True, sharey=True)
# Get the data for each class in the DataFrame
for i, (direction, directional_data) in enumerate(hourly_counts_d.groupby('Direction')):
for vehicle_class, data in directional_data.groupby('Info'):
# Plot the data for that class
all_axes[i].plot_date(data['DateTime'], data['Count'], label=vehicle_class, linestyle='-', marker='')
all_axes[i].margins(x=0)
all_axes[i].set_title(direction)
all_axes[i].set_ylabel('Vehicles')
# Set up legend
plt.legend(frameon=True, facecolor='white',
loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.7))
# Set up labels
plt.xlabel('Date')
fig.suptitle('Flow by Direction - JTC00567')
# Fix margins properly
plt.ylim(bottom=0)
(0, 1582.3)
pandas includes a method that lets a DataFrame be exported as a histogram# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
# Filter our original DataFrame to only contain cars (class 2)
cars = original_data[original_data['Class'] == '2']
# Set up the bins we want to use in our histogram
bin_size = 10
bins = range(0,max(cars['SpeedMPH']) + bin_size, bin_size)
# Plot the histogram
cars.hist(column='SpeedMPH', bins=bins, ax=ax, alpha=0.5)
# Set up labels and title
plt.xlabel('Speed (mph)')
plt.ylabel('Number of Vehicles')
plt.title('Car Speeds Histogram')
# Fix the margins
ax.margins(x=0)
seaborn over matplotlib is the ease with which heatmaps can be madenb_cars = hourly_counts_d[(hourly_counts_d['Info'] == 'Car / Small Van <5.2 metres') &
(hourly_counts_d['Direction'] == 'N')].copy()
nb_cars['Day'] = nb_cars['DateTime'].dt.weekday_name
nb_cars['Hour'] = nb_cars['DateTime'].dt.time#hour.astype('str')
nb_cars.head(15)
| DateTime | Info | Direction | Count | Day | Hour | |
|---|---|---|---|---|---|---|
| 0 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | N | 26 | Wednesday | 00:00:00 |
| 7 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | N | 19 | Wednesday | 01:00:00 |
| 15 | 2016-06-01 02:00:00 | Car / Small Van <5.2 metres | N | 19 | Wednesday | 02:00:00 |
| 23 | 2016-06-01 03:00:00 | Car / Small Van <5.2 metres | N | 38 | Wednesday | 03:00:00 |
| 31 | 2016-06-01 04:00:00 | Car / Small Van <5.2 metres | N | 79 | Wednesday | 04:00:00 |
| 43 | 2016-06-01 05:00:00 | Car / Small Van <5.2 metres | N | 256 | Wednesday | 05:00:00 |
| 54 | 2016-06-01 06:00:00 | Car / Small Van <5.2 metres | N | 561 | Wednesday | 06:00:00 |
| 66 | 2016-06-01 07:00:00 | Car / Small Van <5.2 metres | N | 696 | Wednesday | 07:00:00 |
| 78 | 2016-06-01 08:00:00 | Car / Small Van <5.2 metres | N | 688 | Wednesday | 08:00:00 |
| 90 | 2016-06-01 09:00:00 | Car / Small Van <5.2 metres | N | 767 | Wednesday | 09:00:00 |
| 102 | 2016-06-01 10:00:00 | Car / Small Van <5.2 metres | N | 900 | Wednesday | 10:00:00 |
| 114 | 2016-06-01 11:00:00 | Car / Small Van <5.2 metres | N | 818 | Wednesday | 11:00:00 |
| 126 | 2016-06-01 12:00:00 | Car / Small Van <5.2 metres | N | 811 | Wednesday | 12:00:00 |
| 139 | 2016-06-01 13:00:00 | Car / Small Van <5.2 metres | N | 852 | Wednesday | 13:00:00 |
| 151 | 2016-06-01 14:00:00 | Car / Small Van <5.2 metres | N | 874 | Wednesday | 14:00:00 |
column_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
pivot = nb_cars.pivot(index='Day', columns='Hour', values='Count')
pivot = pivot.reindex_axis(column_order, axis=0)
pivot
| Hour | 00:00:00 | 01:00:00 | 02:00:00 | 03:00:00 | 04:00:00 | 05:00:00 | 06:00:00 | 07:00:00 | 08:00:00 | 09:00:00 | ... | 14:00:00 | 15:00:00 | 16:00:00 | 17:00:00 | 18:00:00 | 19:00:00 | 20:00:00 | 21:00:00 | 22:00:00 | 23:00:00 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Day | |||||||||||||||||||||
| Sunday | 74 | 39 | 30 | 32 | 41 | 101 | 215 | 325 | 453 | 701 | ... | 830 | 838 | 839 | 615 | 524 | 392 | 305 | 154 | 85 | 80 |
| Monday | 40 | 16 | 22 | 46 | 176 | 382 | 543 | 716 | 697 | 868 | ... | 924 | 1073 | 1173 | 667 | 523 | 363 | 291 | 151 | 81 | 48 |
| Tuesday | 22 | 19 | 20 | 23 | 82 | 305 | 515 | 631 | 649 | 709 | ... | 828 | 1001 | 1150 | 690 | 516 | 408 | 290 | 177 | 67 | 69 |
| Wednesday | 26 | 19 | 19 | 38 | 79 | 256 | 561 | 696 | 688 | 767 | ... | 874 | 1062 | 1130 | 797 | 561 | 397 | 309 | 174 | 114 | 124 |
| Thursday | 62 | 26 | 15 | 37 | 83 | 292 | 536 | 648 | 715 | 818 | ... | 927 | 1093 | 1168 | 861 | 635 | 482 | 313 | 212 | 102 | 43 |
| Friday | 20 | 23 | 24 | 33 | 77 | 242 | 475 | 635 | 631 | 839 | ... | 1226 | 1320 | 1507 | 1278 | 852 | 542 | 337 | 253 | 146 | 88 |
| Saturday | 48 | 29 | 36 | 37 | 85 | 186 | 351 | 531 | 756 | 1163 | ... | 970 | 914 | 775 | 613 | 461 | 320 | 218 | 202 | 151 | 108 |
7 rows × 24 columns
# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(pivot)
<matplotlib.axes._subplots.AxesSubplot at 0x2a5906a0>
# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(pivot, annot=True, fmt='d', annot_kws={'size': 12})
<matplotlib.axes._subplots.AxesSubplot at 0x2ae20e48>
import os
all_data = pd.concat([pd.read_csv(os.path.join('VBV_Data', f), low_memory=False)
for f in os.listdir('VBV_Data')
if f.startswith('VBV')])
all_cars = all_data[all_data['Info'].str.contains('Car')]
# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 8))
box_plot = sns.boxplot(data=all_cars, y='SpeedMPH', x='SiteName')
for lbl in box_plot.get_xticklabels():
lbl.set_rotation(90)
ax.margins(y=0)
plotly is a different data visualisation library that allows for the creation of interactive graphslines = []
for vehicle_class, data in hourly_counts.groupby('Info'):
# Plot the data for that class
#ax.plot_date(data['DateTime'], data['Count'], label=vehicle_class, linestyle='-', marker='')
line = go.Scatter(x=data['DateTime'], y=data['Count'], name=vehicle_class)
lines.append(line)
layout = go.Layout(legend=dict(orientation='h',
xanchor='center',
y=-0.2,
x=0.5),
xaxis=dict(title='Date'),
yaxis=dict(title='Vehicle Count'),
autosize=False,
width=1000,
height=600)
fig = go.Figure(data=lines, layout=layout)
plotly.offline.iplot(fig)
hist = go.Histogram(x=cars['SpeedMPH'], opacity=0.5,
autobinx=False, xbins=dict(start=0,
end=max(cars['SpeedMPH']),
size=10))
layout = go.Layout(xaxis=dict(title='Speed (mph)'),
yaxis=dict(title='Number of Vehicles'),
title='Car Speeds Histogram',
autosize=False,
width=1000,
height=600)
figure = go.Figure(data=[hist], layout=layout)
plotly.offline.iplot(figure)
column_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
nb_cars['Day'] = pd.Categorical(nb_cars['Day'],
categories=column_order,
ordered=True)
nb_cars.sort_values(['Day', 'Hour'], inplace=True)
heat = go.Heatmap(x=nb_cars['Hour'],
y=nb_cars['Day'],
z=nb_cars['Count'],
hoverinfo='z')
layout = go.Layout(xaxis=dict(title='Hour'),
yaxis=dict(autorange='reversed'),
title='Car Speeds Histogram',
autosize=False,
width=1000,
height=600)
figure = go.Figure(data=[heat], layout=layout)
plotly.offline.iplot(figure)
boxes = [go.Box(y=data['SpeedMPH'], name=site)
for site, data in all_cars.groupby('SiteName')]
layout = go.Layout(xaxis=dict(title='Site'),
yaxis=dict(title='Speed (mph)'),
title='Car Speeds Box Plot',
autosize=False,
width=1000,
height=600)
figure = go.Figure(data=boxes, layout=layout)
plotly.offline.iplot(figure)